{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n===============================================================\nModel selection with Probabilistic PCA and Factor Analysis (FA)\n===============================================================\n\nProbabilistic PCA and Factor Analysis are probabilistic models.\nThe consequence is that the likelihood of new data can be used\nfor model selection and covariance estimation.\nHere we compare PCA and FA with cross-validation on low rank data corrupted\nwith homoscedastic noise (noise variance\nis the same for each feature) or heteroscedastic noise (noise variance\nis the different for each feature). In a second step we compare the model\nlikelihood to the likelihoods obtained from shrinkage covariance estimators.\n\nOne can observe that with homoscedastic noise both FA and PCA succeed\nin recovering the size of the low rank subspace. The likelihood with PCA\nis higher than FA in this case. However PCA fails and overestimates\nthe rank when heteroscedastic noise is present. Under appropriate\ncircumstances the low rank models are more likely than shrinkage models.\n\nThe automatic estimation from\nAutomatic Choice of Dimensionality for PCA. NIPS 2000: 598-604\nby Thomas P. Minka is also compared.\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Authors: Alexandre Gramfort\n#          Denis A. Engemann\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\nfrom scipy import linalg\n\nfrom sklearn.decomposition import PCA, FactorAnalysis\nfrom sklearn.covariance import ShrunkCovariance, LedoitWolf\nfrom sklearn.model_selection import cross_val_score\nfrom sklearn.model_selection import GridSearchCV\n\nprint(__doc__)\n\n# #############################################################################\n# Create the data\n\nn_samples, n_features, rank = 1000, 50, 10\nsigma = 1.\nrng = np.random.RandomState(42)\nU, _, _ = linalg.svd(rng.randn(n_features, n_features))\nX = np.dot(rng.randn(n_samples, rank), U[:, :rank].T)\n\n# Adding homoscedastic noise\nX_homo = X + sigma * rng.randn(n_samples, n_features)\n\n# Adding heteroscedastic noise\nsigmas = sigma * rng.rand(n_features) + sigma / 2.\nX_hetero = X + rng.randn(n_samples, n_features) * sigmas\n\n# #############################################################################\n# Fit the models\n\nn_components = np.arange(0, n_features, 5)  # options for n_components\n\n\ndef compute_scores(X):\n    pca = PCA(svd_solver='full')\n    fa = FactorAnalysis()\n\n    pca_scores, fa_scores = [], []\n    for n in n_components:\n        pca.n_components = n\n        fa.n_components = n\n        pca_scores.append(np.mean(cross_val_score(pca, X)))\n        fa_scores.append(np.mean(cross_val_score(fa, X)))\n\n    return pca_scores, fa_scores\n\n\ndef shrunk_cov_score(X):\n    shrinkages = np.logspace(-2, 0, 30)\n    cv = GridSearchCV(ShrunkCovariance(), {'shrinkage': shrinkages})\n    return np.mean(cross_val_score(cv.fit(X).best_estimator_, X))\n\n\ndef lw_score(X):\n    return np.mean(cross_val_score(LedoitWolf(), X))\n\n\nfor X, title in [(X_homo, 'Homoscedastic Noise'),\n                 (X_hetero, 'Heteroscedastic Noise')]:\n    pca_scores, fa_scores = compute_scores(X)\n    n_components_pca = n_components[np.argmax(pca_scores)]\n    n_components_fa = n_components[np.argmax(fa_scores)]\n\n    pca = PCA(svd_solver='full', n_components='mle')\n    pca.fit(X)\n    n_components_pca_mle = pca.n_components_\n\n    print(\"best n_components by PCA CV = %d\" % n_components_pca)\n    print(\"best n_components by FactorAnalysis CV = %d\" % n_components_fa)\n    print(\"best n_components by PCA MLE = %d\" % n_components_pca_mle)\n\n    plt.figure()\n    plt.plot(n_components, pca_scores, 'b', label='PCA scores')\n    plt.plot(n_components, fa_scores, 'r', label='FA scores')\n    plt.axvline(rank, color='g', label='TRUTH: %d' % rank, linestyle='-')\n    plt.axvline(n_components_pca, color='b',\n                label='PCA CV: %d' % n_components_pca, linestyle='--')\n    plt.axvline(n_components_fa, color='r',\n                label='FactorAnalysis CV: %d' % n_components_fa,\n                linestyle='--')\n    plt.axvline(n_components_pca_mle, color='k',\n                label='PCA MLE: %d' % n_components_pca_mle, linestyle='--')\n\n    # compare with other covariance estimators\n    plt.axhline(shrunk_cov_score(X), color='violet',\n                label='Shrunk Covariance MLE', linestyle='-.')\n    plt.axhline(lw_score(X), color='orange',\n                label='LedoitWolf MLE' % n_components_pca_mle, linestyle='-.')\n\n    plt.xlabel('nb of components')\n    plt.ylabel('CV scores')\n    plt.legend(loc='lower right')\n    plt.title(title)\n\nplt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}